Aidemy Pandas基礎
Pandas
Pandasを用いたCSVの読み込み
code: Python
import pandas as pd

# Read cleansing_data/iris.csv into a DataFrame; the trailing bare name
# displays the frame when the snippet is run in a notebook cell.
iris_csv_path = "cleansing_data/iris.csv"
df = pd.read_csv(iris_csv_path)
df
CSVライブラリを用いたCSVの作成
code: Python
import csv

# newline="" keeps the text layer from translating the writer's "\n" line
# terminator into "\r\n" on platforms that use that line ending.
with open("./cleansing_data/csv0.csv", "w", newline="") as csvfile:
    # The writer must be created inside the with-block so that it is only
    # used while the file is open.
    # NOTE(review): no writerow() calls are recorded in this note, so as
    # written the snippet only creates an empty file.
    writer = csv.writer(csvfile, lineterminator="\n")
Pandasを用いたCSVの作成
code: Python
import pandas as pd

# The note never defined `data`; these values are rebuilt from the CSV
# output recorded right below this snippet.
# NOTE(review): the empty country for Linux is written as "" here; a missing
# value (NaN) would produce the same CSV text - confirm which was intended.
data = {
    "OS": ["Machintosh", "Windows", "Linux"],
    "release": [1984, 1985, 1991],
    "country": ["US", "US", ""],
}
df = pd.DataFrame(data)
# Writes the frame, including the integer index column, to OSlist.csv in the
# current working directory.
df.to_csv("OSlist.csv")
--------------------------------------------------------------------------
,OS,release,country
0,Machintosh,1984,US
1,Windows,1985,US
2,Linux,1991,
--------------------------------------------------------------------------
DataFrameの復習
code: Python
import pandas as pd
from pandas import Series, DataFrame

# First batch of records; each column holds ten entries.
attri_data1 = {
    "ID": ["100", "101", "102", "103", "104",
           "106", "108", "110", "111", "113"],
    "city": ["Tokyo", "Osaka", "Kyoto", "Hokkaido", "Tokyo",
             "Tokyo", "Osaka", "Kyoto", "Hokkaido", "Tokyo"],
    "birth_year": [1990, 1989, 1992, 1997, 1982,
                   1991, 1988, 1990, 1995, 1981],
}
attri_data_frame1 = DataFrame(attri_data1)

# NOTE(review): attri_data2 is not defined anywhere in this note; presumably
# it is a dict with the same columns as attri_data1. Define it before
# running this snippet.
attri_data_frame2 = DataFrame(attri_data2)

# Stack the two frames, order the rows by ID and renumber the index.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
pd.concat([attri_data_frame1, attri_data_frame2]).sort_values(
    by="ID", ascending=True).reset_index(drop=True)
リストワイズ/ペアワイズ削除
code: Python
# Deleting every row that has missing data (a row containing NaN) in its
# entirety is called listwise deletion.
# Keeping only the columns with few missing values (e.g. columns 0 and 1)
# is called pairwise deletion.
import numpy as np
from numpy import nan as NA
import pandas as pd

# Fixed seed so the random table is reproducible.
np.random.seed(0)
sample_data_frame = pd.DataFrame(np.random.rand(10, 4))

# Punch holes into the table: one cell in column 0, one cell in column 2,
# and rows 5 onward of column 3.
sample_data_frame.iloc[1, 0] = NA
sample_data_frame.iloc[2, 2] = NA
sample_data_frame.iloc[5:, 3] = NA

# Keep only columns 0 and 2, then drop the rows that still contain NaN.
sample_data_frame[[0, 2]].dropna()
--------------------------------------------------------------------------
0 2
0 0.548814 0.602763
3 0.568045 0.071036
4 0.020218 0.778157
5 0.978618 0.461479
6 0.118274 0.143353
7 0.521848 0.264556
8 0.456150 0.018790
9 0.612096 0.943748
--------------------------------------------------------------------------
欠損値の補完
code: Python
# fillna() replaces each NaN with the value passed as its argument.
# Forward fill replaces each NaN with the previous value in the same column;
# ffill() is the current spelling of the deprecated fillna(method="ffill").
import numpy as np
from numpy import nan as NA
import pandas as pd

# Fixed seed so the random table is reproducible.
np.random.seed(0)
sample_data_frame = pd.DataFrame(np.random.rand(10, 4))

# Punch holes into the table: one cell in column 0 and rows 6 onward of
# column 2.
sample_data_frame.iloc[1, 0] = NA
sample_data_frame.iloc[6:, 2] = NA

# Fill every NaN with the last valid value above it.
sample_data_frame.ffill()
--------------------------------------------------------------------------
0 1 2 3
0 0.548814 0.715189 0.602763 0.544883
1 0.548814 0.645894 0.437587 0.891773
2 0.963663 0.383442 0.791725 0.528895
3 0.568045 0.925597 0.071036 0.087129
4 0.020218 0.832620 0.778157 0.870012
5 0.978618 0.799159 0.461479 0.780529
6 0.118274 0.639921 0.461479 0.944669
7 0.521848 0.414662 0.461479 0.774234
8 0.456150 0.568434 0.461479 0.617635
9 0.612096 0.616934 0.461479 0.681820
--------------------------------------------------------------------------
欠損値の補完(平均値代入法)
code: Python
# Filling missing values with the mean of their column (or row) is called
# mean imputation.
import numpy as np
from numpy import nan as NA
import pandas as pd

# Fixed seed so the random table is reproducible.
np.random.seed(0)
sample_data_frame = pd.DataFrame(np.random.rand(10, 4))

# Punch holes into the table: one cell in column 0 and rows 6 onward of
# column 2.
sample_data_frame.iloc[1, 0] = NA
sample_data_frame.iloc[6:, 2] = NA

# mean() yields one mean per column (NaN cells are skipped); fillna() then
# uses each column's mean for the NaN cells of that column.
sample_data_frame.fillna(sample_data_frame.mean())
--------------------------------------------------------------------------
0 1 2 3
0 0.548814 0.715189 0.602763 0.544883
1 0.531970 0.645894 0.437587 0.891773
2 0.963663 0.383442 0.791725 0.528895
3 0.568045 0.925597 0.071036 0.087129
4 0.020218 0.832620 0.778157 0.870012
5 0.978618 0.799159 0.461479 0.780529
6 0.118274 0.639921 0.523791 0.944669
7 0.521848 0.414662 0.523791 0.774234
8 0.456150 0.568434 0.523791 0.617635
9 0.612096 0.616934 0.523791 0.681820
--------------------------------------------------------------------------
キーごとの統計量の算出
code: Python
import pandas as pd

# Column labels to apply, in order, to the header-less CSV.
# NOTE(review): thirteen labels are listed; confirm that this matches the
# number of columns in wine.csv.
wine_column_names = [
    "Alcohol",
    "Malic acid",
    "Ash",
    "Alcalinity of ash",
    "Magnesium",
    "Total phenols",
    "Flavanoids",
    "Nonflavanoid phenols",
    "Proanthocyanins",
    "Color intensity",
    "Hue",
    "OD280/OD315 of diluted wines",
    "Proline",
]

# header=None: the first line of the file is data, not column names.
df = pd.read_csv("cleansing_data/wine.csv", header=None)
df.columns = wine_column_names
--------------------------------------------------------------------------
--------------------------------------------------------------------------
重複データ
code: Python
import pandas as pd
from pandas import DataFrame

# Sample table that contains several fully duplicated (col1, col2) rows.
dupli_data = DataFrame({
    "col1": [1, 1, 2, 3, 4, 4, 6, 6, 7, 7, 7, 8, 9, 9],
    "col2": ["a", "b", "b", "b", "c", "c", "b", "b", "d", "d", "c", "b", "c", "c"],
})

# Keep the first occurrence of each distinct row; the surviving rows keep
# their original index labels.
dupli_data.drop_duplicates()
--------------------------------------------------------------------------
col1 col2
0 1 a
1 1 b
2 2 b
3 3 b
4 4 c
6 6 b
8 7 d
10 7 c
11 8 b
12 9 c
--------------------------------------------------------------------------
マッピング
code: Python
import pandas as pd
from pandas import DataFrame

# NOTE(review): the opening of this literal (the "ID" column) was lost from
# the note; the IDs are rebuilt from the output table recorded below this
# snippet. Whether they were strings or integers cannot be told from that
# table - confirm.
attri_data1 = {
    "ID": ["100", "101", "102", "103", "104",
           "106", "108", "110", "111", "113"],
    "city": ["Tokyo", "Osaka", "Kyoto", "Hokkaido", "Tokyo",
             "Tokyo", "Osaka", "Kyoto", "Hokkaido", "Tokyo"],
    "birth_year": [1990, 1989, 1992, 1997, 1982,
                   1991, 1988, 1990, 1995, 1981],
    "name": ["Hiroshi", "Akiko", "Yuki", "Satoru", "Steeve",
             "Mituru", "Aoi", "Tarou", "Suguru", "Mitsuo"],
}
attri_data_frame1 = DataFrame(attri_data1)

# Lookup table from city name to region label.
map_data = {"Tokyo": "east", "Hokkaido": "east", "Osaka": "west", "Kyoto": "west"}

# Add a "WE" column by translating each row's city through map_data.
attri_data_frame1["WE"] = attri_data_frame1["city"].map(map_data)
attri_data_frame1
--------------------------------------------------------------------------
ID city birth_year name WE
0 100 Tokyo 1990 Hiroshi east
1 101 Osaka 1989 Akiko west
2 102 Kyoto 1992 Yuki west
3 103 Hokkaido 1997 Satoru east
4 104 Tokyo 1982 Steeve east
5 106 Tokyo 1991 Mituru east
6 108 Osaka 1988 Aoi west
7 110 Kyoto 1990 Tarou west
8 111 Hokkaido 1995 Suguru east
9 113 Tokyo 1981 Mitsuo east
--------------------------------------------------------------------------
ビン分割
code: Python
# Binning (bin splitting) is the process of cutting continuous values at
# chosen boundaries, grouping them into categories and thereby converting
# them to discrete values.
import pandas as pd
from pandas import DataFrame

# NOTE(review): the opening of this literal (the "ID" column) was lost from
# the note; the IDs are rebuilt from the output tables recorded in this
# note. They are integers here because pd.cut needs numeric input.
attri_data1 = {
    "ID": [100, 101, 102, 103, 104, 106, 108, 110, 111, 113],
    "city": ["Tokyo", "Osaka", "Kyoto", "Hokkaido", "Tokyo",
             "Tokyo", "Osaka", "Kyoto", "Hokkaido", "Tokyo"],
    "birth_year": [1990, 1989, 1992, 1997, 1982,
                   1991, 1988, 1990, 1995, 1981],
    "name": ["Hiroshi", "Akiko", "Yuki", "Satoru", "Steeve",
             "Mituru", "Aoi", "Tarou", "Suguru", "Mitsuo"],
}
attri_data_frame1 = DataFrame(attri_data1)

# Split the ID range into two equal-width bins.
pd.cut(attri_data_frame1.ID, 2)
--------------------------------------------------------------------------
0 (99.987, 106.5]
1 (99.987, 106.5]
2 (99.987, 106.5]
3 (99.987, 106.5]
4 (99.987, 106.5]
5 (99.987, 106.5]
6 (106.5, 113.0]
7 (106.5, 113.0]
8 (106.5, 113.0]
9 (106.5, 113.0]
Name: ID, dtype: category
--------------------------------------------------------------------------